Author

Emily Lo

Code
library(tidyverse)
library(here)
library(dplyr)
library(tidyr)
library(broom)
library(gt)
library(glue)
library(DT)
library(janitor)
Code
names <- read_csv(here("week 9", "lab 9", "StateNames_A.csv"))

3 Summarizing & Visualizing the Number of Allisons

Code
#data cleaning
cleaned_names <- names |>
  rename(Sex = Gender)
datatable(cleaned_names)
Code
cleaned_names_table <- cleaned_names |>
  filter(Name == "Allison") |>
  group_by(State, Sex) |>
  summarize(Count = sum(Count), 
            .groups = "drop") |>
  pivot_wider(names_from = Sex, 
              values_from = Count, 
              values_fill = 0) |>
  gt() |>
  tab_header(
    title = md("**Frequency Table of Babies Named Allison**"),
    subtitle = "for 192,825 babies"
  ) |>
  cols_label(
    F = "Female",
    M = "Male")

cleaned_names_table
Frequency Table of Babies Named Allison
for 192,825 babies
State Female Male
AK 232 0
AL 1535 0
AR 1198 0
AZ 1880 0
CA 12413 0
CO 1594 0
CT 1099 0
DC 321 0
DE 294 0
FL 4455 0
GA 3257 0
HI 183 0
IA 1477 0
ID 451 0
IL 5110 0
IN 3067 0
KS 1283 0
KY 1905 20
LA 1209 0
MA 2218 0
MD 2229 0
ME 340 0
MI 4014 0
MN 2374 0
MO 2882 0
MS 817 0
MT 226 0
NC 3435 0
ND 285 0
NE 807 0
NH 412 0
NJ 3052 0
NM 399 0
NV 729 0
NY 5747 0
OH 5487 0
OK 1421 0
OR 1186 0
PA 4307 0
RI 306 0
SC 1228 0
SD 376 0
TN 2488 0
TX 10192 0
UT 1125 0
VA 3220 0
VT 135 0
WA 1956 0
WI 2367 0
WV 813 0
WY 142 0
  1. You should have seen in the table above that “Allison” is a name given overwhelmingly to babies assigned “female” at birth. So, create a new dataset named allison_f which contains only the babies assigned Female at birth.
Code
allison_f <- cleaned_names |>
  filter(Name == "Allison", 
         Sex == "F")
  1. Make a visualization showing how the popularity of the name “Allison” has changed over the years. To be clear, each year should have one observation–the total number of Allisons born that year.
Code
allison_totals <- allison_f |> 
  group_by(Year) |>
  summarize(total = sum(Count))

ggplot(data = allison_totals, 
       mapping = aes(x = Year, 
                     y = total)) +
geom_line() +
  labs(title = "Popularity Total of the Name Allisons Over the years",
       x = "Year",
       y = "")

4 Modeling the Number of Allisons

Fit a linear model with the year as the explanatory variable, and the number of Allisons as the response. Similar to #3, each year should have one observation–the total number of Allisons born that year.

Code
allison_lm <- lm(total ~ Year, 
                 data = allison_totals)

allison_tidy_table <- broom::tidy(allison_lm) |>
  gt() |>
  tab_header(
    title = md("**Regression Results Table**")) |> 
  tab_style(
    style = list(
      cell_text(font = "Times", 
                weight = "lighter", 
                size = "11px")),
    locations = cells_body(columns = everything(), 
                           rows = 1:2)
  )
  
allison_tidy_table
Regression Results Table
term estimate std.error statistic p.value
(Intercept) 209689.7609 42971.50497 4.879740 0.0001669256
Year -101.5191 21.42676 -4.737959 0.0002228430
  1. Predicted total Allisons = -102(Year) + 209690
Code
allison_residual <- augment(allison_lm, 
                            newdata = allison_totals) 

ggplot(data = allison_residual, 
       mapping = aes(x = Year, 
                     y = .resid)) + 
  geom_point() +
  labs(title = "Residuals of the Total Allisons over the Years",
       x = "Year",
       y = "")

For the earlier years, residuals are higher, meaning that we may have under-predicted the popularity of the name Allison. For later years, residuals are lower, meaning that we may have over-predicted the popularity of the name Allison

  1. I conclude that your name’s popularity has been decreasing from 1998 to 2005 and increased a bit from 2005 to 2009 and decreased again from 2009 to 2014. Your name is not as popular as it was in the earlier years.

Code
Name_order <- c("Alan", "Allen", "Allan")

male_names_totals <- cleaned_names |> 
  filter(Name %in% c("Allen", 
         "Alan", 
         "Allan")) |>
  group_by(Name, Year) |>
  summarize(total = sum(Count)) |>

mutate(Name = factor(Name, 
                     levels = Name_order))
ggplot(data = male_names_totals, 
       mapping = aes(x = Year, 
                     y = total, 
                     color = Name)) + 
  geom_line() +
  labs(title = "Popularity Total of the Names: Allan, Alan, and Allen Over the Years",
       x = "Year",
       y = "")

Code
Allan_totals <- cleaned_names |> 
  filter(Year == 2000,
         State %in% c("CA","PA"),
         Name %in% c("Allan", 
         "Alan", 
         "Allen")) |>
  group_by(State, Name) |>
  summarize(total = sum(Count), 
            .groups = "drop") |>
  pivot_wider(names_from = Name, 
              values_from = total,
              values_fill = 0)

Allan_table <- gt(Allan_totals) |>
  tab_header(
    title = md("**Frequency of Allans**"),
    subtitle = ("Year: 2000")) |>
  tab_style(
    style = list(
      cell_borders(sides = "all", 
                   color = "#87CEEB",
                   weight = px(1.5),
                   style = "solid"
                     ),
      cell_fill(color = "#FFC0CB", alpha = NULL),
      cell_text(font = "Impact", 
                weight = "lighter", 
                size = "14px")),
    locations = cells_body()) 

Allan_table
Frequency of Allans
Year: 2000
State Alan Allan Allen
CA 584 131 176
PA 51 12 56
Code
# I learned how to tab_style on https://gt.rstudio.com/reference/tab_style.html. I think that it can make your table more creative. 

10.

Code
Allan_total_percentage <- cleaned_names |> 
  filter(Year == 2000,
         State %in% c("CA","PA"),
         Name %in% c("Allan", "Alan", "Allen")) |>
  group_by(State, Name) |>
  summarize(total = sum(Count), 
            .groups = "drop") |>
  mutate(prop = total / sum(total)) |>
  group_by(State) |>
  ungroup() |> 
  select(-total) |>
  pivot_wider(names_from = Name, 
              values_from = prop, 
              values_fill = 0)

Allan_percent_table <- gt(Allan_total_percentage) |>
  fmt_percent(columns = 2:4, decimals = 2) |>
  tab_header(
    title = md("**<span style = 'color:#BA55D3;'>Frequency of Allans</span>**"),
    subtitle = ("Year: 2000")) |>
  tab_style(
    style = list(
      cell_text(font = "Georgia",
                weight = "bold",
                size = "14px",
                align = "right")),
    locations = cells_body())


Allan_percent_table
Frequency of Allans
Year: 2000
State Alan Allan Allen
CA 57.82% 12.97% 17.43%
PA 5.05% 1.19% 5.54%